This dataset was made publicly available by IBM and was downloaded from
Kaggle.
- Data Integrity: This data does not include Personally
Identifiable Information such as phone numbers, identity numbers, or credit
card numbers, so users' privacy is protected.
- Data location: Click this link to find
the dataset
- Data Organization: The data is organised as a single .csv
file named “WA_Fn-UseC_-Telco-Customer-Churn.csv”.
- Tool Selection: I chose to use R for its robust
statistical packages and clean data manipulation syntax.
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library ("caret")
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library("randomForest")
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
# Load the raw Telco churn data; the path is relative to the working directory.
telco_customer_churn <- read_csv(file = "datasets/telco_customer_churn.csv")
## Rows: 7043 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (17): customerID, gender, Partner, Dependents, PhoneService, MultipleLin...
## dbl (4): SeniorCitizen, tenure, MonthlyCharges, TotalCharges
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the column names exactly as they were imported, before any renaming.
print("Here is a list of the column names:")
## [1] "Here is a list of the column names:"
names(telco_customer_churn)
## [1] "customerID" "gender" "SeniorCitizen" "Partner"
## [5] "Dependents" "tenure" "PhoneService" "MultipleLines"
## [9] "InternetService" "OnlineSecurity" "OnlineBackup" "DeviceProtection"
## [13] "TechSupport" "StreamingTV" "StreamingMovies" "Contract"
## [17] "PaperlessBilling" "PaymentMethod" "MonthlyCharges" "TotalCharges"
## [21] "Churn"
# Standardise the mixed-case column names to snake_case. `gender` and `tenure`
# are already lower case, so they are the only two columns not listed here.
telco_customer_churn <- rename(
  telco_customer_churn,
  customer_id = customerID,
  senior_citizen = SeniorCitizen,
  partner = Partner,
  dependents = Dependents,
  phone_service = PhoneService,
  multiple_lines = MultipleLines,
  internet_service = InternetService,
  online_security = OnlineSecurity,
  online_backup = OnlineBackup,
  device_protection = DeviceProtection,
  tech_support = TechSupport,
  streaming_tv = StreamingTV,
  streaming_movies = StreamingMovies,
  contract = Contract,
  paperless_billing = PaperlessBilling,
  payment_method = PaymentMethod,
  monthly_charges = MonthlyCharges,
  total_charges = TotalCharges,
  churn = Churn
)
print("Nineteen columns have been renamed here are the current column names:")
## [1] "Nineteen columns have been renamed here are the current column names:"
names(telco_customer_churn)
## [1] "customer_id" "gender" "senior_citizen"
## [4] "partner" "dependents" "tenure"
## [7] "phone_service" "multiple_lines" "internet_service"
## [10] "online_security" "online_backup" "device_protection"
## [13] "tech_support" "streaming_tv" "streaming_movies"
## [16] "contract" "paperless_billing" "payment_method"
## [19] "monthly_charges" "total_charges" "churn"
# Count the missing values in every column. The result is a one-row tibble
# holding one integer count per column.
num_na <- telco_customer_churn %>%
  summarise(
    across(everything(), function(column) sum(is.na(column)))
  )
num_na
## # A tibble: 1 × 21
## customer_id gender senior_citizen partner dependents tenure phone_service
## <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0 0
## # ℹ 14 more variables: multiple_lines <int>, internet_service <int>,
## # online_security <int>, online_backup <int>, device_protection <int>,
## # tech_support <int>, streaming_tv <int>, streaming_movies <int>,
## # contract <int>, paperless_billing <int>, payment_method <int>,
## # monthly_charges <int>, total_charges <int>, churn <int>
# Check the type of total_charges, coerce it to numeric, and check again.
class(telco_customer_churn$total_charges)
## [1] "numeric"
# read_csv() already parsed this column as a double, so the coercion below is
# a safeguard that leaves the column numeric.
telco_customer_churn$total_charges <-
  as.numeric(as.character(telco_customer_churn$total_charges))
class(telco_customer_churn[["total_charges"]])
## [1] "numeric"
# The churn column is imported as character; turn it into a factor so it can
# serve as a categorical outcome, and confirm the new type.
class(telco_customer_churn$churn)
## [1] "character"
telco_customer_churn <- telco_customer_churn %>%
  mutate(churn = as.factor(churn))
class(telco_customer_churn[["churn"]])
## [1] "factor"
# Keep only the rows that have a total_charges value, then recount the
# missing values per column to confirm none are left.
telco_customer_churn <- telco_customer_churn %>%
  filter(!is.na(total_charges))
num_na <- telco_customer_churn %>%
  summarise(
    across(everything(), function(column) sum(is.na(column)))
  )
num_na
## # A tibble: 1 × 21
## customer_id gender senior_citizen partner dependents tenure phone_service
## <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0 0
## # ℹ 14 more variables: multiple_lines <int>, internet_service <int>,
## # online_security <int>, online_backup <int>, device_protection <int>,
## # tech_support <int>, streaming_tv <int>, streaming_movies <int>,
## # contract <int>, paperless_billing <int>, payment_method <int>,
## # monthly_charges <int>, total_charges <int>, churn <int>
# Count rows that are exact duplicates of an earlier row. customer_id is still
# a column at this point, so it takes part in the comparison.
num_duplicates <- telco_customer_churn %>%
  duplicated() %>%
  sum()
num_duplicates
## [1] 0
# Remove the identifier column so it is not picked up as a predictor by the
# later `churn ~ .` model formula, then preview the first six rows.
telco_customer_churn <- select(telco_customer_churn, !customer_id)
head(telco_customer_churn, n = 6)
## # A tibble: 6 × 20
## gender senior_citizen partner dependents tenure phone_service multiple_lines
## <chr> <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 Female 0 Yes No 1 No No phone service
## 2 Male 0 No No 34 Yes No
## 3 Male 0 No No 2 Yes No
## 4 Male 0 No No 45 No No phone service
## 5 Female 0 No No 2 Yes No
## 6 Female 0 No No 8 Yes Yes
## # ℹ 13 more variables: internet_service <chr>, online_security <chr>,
## # online_backup <chr>, device_protection <chr>, tech_support <chr>,
## # streaming_tv <chr>, streaming_movies <chr>, contract <chr>,
## # paperless_billing <chr>, payment_method <chr>, monthly_charges <dbl>,
## # total_charges <dbl>, churn <fct>
# Save the cleaned data under datasets/processed, using a path relative to the
# working directory (the same convention as the read_csv() call that loads the
# raw file) instead of a machine-specific absolute Windows path.
processed_dir <- file.path("datasets", "processed")
# Create the output folder when it is missing so the write does not fail on a
# machine where the folder has not been set up yet.
dir.create(processed_dir, recursive = TRUE, showWarnings = FALSE)
write_csv(
  telco_customer_churn,
  file.path(processed_dir, "clean_telco_customer_churn.csv")
)
# Per-column summary of the cleaned data.
summary(telco_customer_churn)
## gender senior_citizen partner dependents
## Length:7032 Min. :0.0000 Length:7032 Length:7032
## Class :character 1st Qu.:0.0000 Class :character Class :character
## Mode :character Median :0.0000 Mode :character Mode :character
## Mean :0.1624
## 3rd Qu.:0.0000
## Max. :1.0000
## tenure phone_service multiple_lines internet_service
## Min. : 1.00 Length:7032 Length:7032 Length:7032
## 1st Qu.: 9.00 Class :character Class :character Class :character
## Median :29.00 Mode :character Mode :character Mode :character
## Mean :32.42
## 3rd Qu.:55.00
## Max. :72.00
## online_security online_backup device_protection tech_support
## Length:7032 Length:7032 Length:7032 Length:7032
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## streaming_tv streaming_movies contract paperless_billing
## Length:7032 Length:7032 Length:7032 Length:7032
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## payment_method monthly_charges total_charges churn
## Length:7032 Min. : 18.25 Min. : 18.8 No :5163
## Class :character 1st Qu.: 35.59 1st Qu.: 401.4 Yes:1869
## Mode :character Median : 70.35 Median :1397.5
## Mean : 64.80 Mean :2283.3
## 3rd Qu.: 89.86 3rd Qu.:3794.7
## Max. :118.75 Max. :8684.8
# Total number of customers, i.e. rows remaining after cleaning.
num_customers <- nrow(telco_customer_churn)
num_customers
## [1] 7032
# Per churn group: mean tenure, mean monthly charge, customer count, and the
# group's share of all customers as a percentage rounded to two decimals.
# The column name `monthly_avgerage` keeps its existing spelling so that any
# code referring to it continues to work.
churn_summary <- telco_customer_churn %>%
  group_by(churn) %>%
  summarise(
    average_tenure = mean(tenure),
    monthly_avgerage = mean(monthly_charges),
    count = n()
  ) %>%
  mutate(percentage = round(count / num_customers * 100, 2))
churn_summary
## # A tibble: 2 × 5
## churn average_tenure monthly_avgerage count percentage
## <fct> <dbl> <dbl> <int> <dbl>
## 1 No 37.7 61.3 5163 73.4
## 2 Yes 18.0 74.4 1869 26.6
# Fix the RNG seed so the train/test split and the forest are reproducible.
set.seed(42)
# read_csv() imported the categorical predictors as character vectors.
# randomForest() and its predict() method expect categorical predictors to be
# factors, so convert them once here, before splitting, which also guarantees
# that the training and test sets share identical factor levels.
model_data <- telco_customer_churn %>%
  mutate(across(where(is.character), as.factor))
# 80% of the rows go to training, the remaining 20% to testing.
train_index <- createDataPartition(model_data$churn, p = 0.8, list = FALSE)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]
# Random forest of 100 trees predicting churn from every other column.
rf_model <- randomForest(churn ~ ., data = train_data, ntree = 100, importance = TRUE)
# Class predictions using the model's default decision rule.
predictions <- predict(rf_model, test_data)
# Class probabilities, used to apply a custom decision threshold below.
prob_predictions <- predict(rf_model, test_data, type = "prob")
# Label a customer "Yes" when the predicted churn probability exceeds 0.30.
custom_threshold <- 0.30
new_predictions <- ifelse(prob_predictions[, "Yes"] > custom_threshold, "Yes", "No")
# Same level order as the churn factor so confusionMatrix() can compare them.
new_predictions <- factor(new_predictions, levels = c("No", "Yes"))
# Evaluate both sets of predictions, treating "Yes" (churned) as the positive
# class.
conf_matrix <- confusionMatrix(predictions, test_data$churn, positive = "Yes")
new_conf_matrix <- confusionMatrix(new_predictions, test_data$churn, positive = "Yes")